<?php

/**
 * Dictionary-driven word tokenizer for Chinese text.
 *
 * The dictionary is kept as a map of word length (in characters, as counted by
 * mb_strlen()) to the list of words having that length. Tokenizing a segment
 * enumerates all of its substrings, claims the longest dictionary matches first,
 * and reports every stretch of text not covered by a match as a "remaining" word.
 */
class ChineseWordTokenizer {
    /**
     * Dictionary grouped by word length: [length => list of words].
     * @var array
     */
    protected $_wordsMap = [];
    /**
     * When true, a space ends the current segment (it is treated like punctuation).
     * @var bool
     */
    protected $_spaceBreak = false;
    /**
     * When true, runs of ASCII letters, digits, dots, spaces and apostrophes are
     * split out of unmatched text into separate entries.
     * @var bool
     */
    protected $_separatedFromLetters = false;

    /**
     * @param array|string|null $file      A ready dictionary map (see setWords()), a dictionary
     *                                     file path (see loadWords()), or null for an empty dictionary.
     * @param bool              $useCache  Forwarded to loadWords() when $file is a path.
     * @param string            $delimiter Forwarded to loadWords() when $file is a path.
     */
    public function __construct($file=null, $useCache=false, $delimiter="\n") {
        if(is_array($file)) {
            $this->setWords($file);
        } else if($file) {
            $this->loadWords($file, $useCache, $delimiter);
        }
    }

    /**
     * Replace the dictionary.
     *
     * @param array $words Map of word length => list of words (strings) of that length.
     * @return $this
     */
    public function setWords($words) {
        $this->_wordsMap = $words;
        return $this;
    }

    /**
     * Get, and optionally set, the "space breaks a segment" flag.
     *
     * @param bool|null $b New value, or null to leave the flag untouched.
     * @return bool The value the flag had before this call.
     */
    public function spaceBreak($b=null) {
        $previous = $this->_spaceBreak;

        if(null !== $b) {
            $this->_spaceBreak = $b;
        }

        return $previous;
    }

    /**
     * Get, and optionally set, the "separate letters from Chinese text" flag.
     *
     * @param bool|null $b New value, or null to leave the flag untouched.
     * @return bool The value the flag had before this call.
     */
    public function separatedFromLetters($b=null) {
        $previous = $this->_separatedFromLetters;

        if(null !== $b) {
            $this->_separatedFromLetters = $b;
        }

        return $previous;
    }

    /**
     * Perform word segmentation.
     *
     * The text is cut into segments at punctuation (and at spaces when spaceBreak is on).
     * A single segment yields entries keyed by the character offset of each word inside
     * that segment; several segments (or an array input) are tokenized one by one and
     * concatenated with array_merge(), which renumbers the keys sequentially.
     *
     * @param string|array $text Text to tokenize, or a list of texts.
     * @return array List of ['word' => string, 'found' => 1|0] entries, where 'found'
     *               is 1 for dictionary matches and 0 for unmatched text.
     */
    public function compute($text) {
        if(is_array($text)) {
            $result = [];

            foreach ($text as $item) {
                $result = array_merge($result, $this->compute($item));
            }

            return $result;
        }

        $segments = $this->_splitSegments($text);

        if(count($segments) !== 1) {
            // No segment at all gives []; several segments are handled one at a time.
            return $this->compute($segments);
        }

        // $segments is re-indexed, so the only segment is reliably at key 0 even when
        // the input started with a separator.
        $candidates = $this->_enumerateSubstrings($segments[0]);
        $foundWords = [];

        // Pass 1: longest candidates first; a dictionary hit claims its span and knocks
        // out every candidate overlapping it. The loops walk a snapshot, hence the
        // isset() re-check against the live array.
        foreach($candidates as $len => $posWords) {
            foreach($posWords as $pos => $posWord) {
                if(!isset($candidates[$len][$pos])) {
                    continue;
                }

                if($this->_findWord($posWord)) {
                    $foundWords[$len][$pos] = $posWord;
                    $this->_removeRelativeWords($candidates, $len, $pos);
                }
            }
        }

        // Pass 2: whatever survived lies between matches; again longest first, so each
        // uncovered gap is reported once as a whole.
        $remainingWords = [];
        foreach($candidates as $len => $posWords) {
            foreach($posWords as $pos => $posWord) {
                if(!isset($candidates[$len][$pos])) {
                    continue;
                }

                if($this->_separatedFromLetters) {
                    $this->_wordSeparatedFromLetters($remainingWords, $posWord, $len, $pos);
                } else {
                    $remainingWords[$len][$pos] = $posWord;
                }

                $this->_removeRelativeWords($candidates, $len, $pos);
            }
        }

        return $this->_mergeResult($foundWords, $remainingWords);
    }

    /**
     * Cut raw text into trimmed, non-empty segments at punctuation marks.
     *
     * @param string $text
     * @return array Re-indexed list of segments (possibly empty).
     */
    protected function _splitSegments($text) {
        $text = trim((string)$text);
        // Existing line breaks are dropped (not treated as separators) because "\n"
        // is used below as the internal segment marker.
        $text = str_replace("\n", "", $text);

        if($this->_spaceBreak) {
            $text = str_replace(" ", "\n", $text);
        }

        // Full-width and ASCII punctuation that ends a segment. The list is kept
        // verbatim; repeated entries are harmless for str_replace().
        $separators = [
            "！", "（", "）", "【", "】", "’", "‘", "“", "”", "。", "、", "、", "，", "；", "：",
            "！", "；", "＇", "．", "，", "＂", "［", "］",
            ",", "!", ";", ":", "\"", "[", "]", "(", ")", "{", "}"
        ];
        $text = str_replace($separators, "\n", $text);

        return $this->_nonEmptyTrimmed(explode("\n", $text));
    }

    /**
     * Trim every piece and drop the ones that end up empty.
     *
     * Uses 'strlen' as the filter so that the string "0" is kept (a bare
     * array_filter() would discard it as falsy), and re-indexes the result.
     *
     * @param array $pieces
     * @return array
     */
    protected function _nonEmptyTrimmed($pieces) {
        return array_values(array_filter(array_map("trim", $pieces), "strlen"));
    }

    /**
     * Enumerate every substring of a segment as [length][start offset] => substring,
     * ordered by length descending.
     *
     * The text is peeled from both ends: each round records all prefixes and all
     * suffixes of the current window, then drops the window's first and last
     * character. The resulting insertion order within one length decides which of
     * two equally long overlapping matches wins, so it is kept as is.
     *
     * @param string $text
     * @return array
     */
    protected function _enumerateSubstrings($text) {
        $candidates = [];
        $offset = 0;
        $window = $text;

        while(($len = mb_strlen($window)) > 0) {
            // Prefixes of the window, longest first.
            for($i = $len; $i > 0; $i--) {
                $candidates[$i][$offset] = ($i == $len) ? $window : mb_substr($window, 0, $i);
            }

            // Proper suffixes of the window.
            for($i = 1; $i < $len; $i++) {
                $candidates[$len - $i][$offset + $i] = mb_substr($window, $i, $len - $i);
            }

            $window = mb_substr($window, 1, $len - 2);
            $offset++;
        }

        krsort($candidates);
        return $candidates;
    }

    /**
     * Split an unmatched word into ASCII runs (letters, digits, '.', space, apostrophe)
     * and the text between them, and record each piece as a remaining word.
     *
     * Pieces are trimmed, so the offsets assigned after a piece that had surrounding
     * whitespace are smaller than the true character offsets; they still stay inside
     * the span of the original word and keep the pieces in order.
     *
     * @param array  $remainingWords Collected remaining words, [length][offset] => word (modified in place).
     * @param string $word           The unmatched word.
     * @param int    $len            Length of $word (unused).
     * @param int    $pos            Offset of $word inside its segment.
     */
    protected function _wordSeparatedFromLetters(&$remainingWords, $word, $len, $pos) {
        $marked = preg_replace("#([0-9a-z\. ']+)#i", "\n$1\n", $word);
        $pieces = $this->_nonEmptyTrimmed(explode("\n", $marked));

        foreach ($pieces as $piece) {
            $pieceLen = mb_strlen($piece);
            $remainingWords[$pieceLen][$pos] = $piece;
            $pos += $pieceLen;
        }
    }

    /**
     * Load the dictionary from a delimiter-separated word file.
     *
     * With caching enabled, the grouped dictionary is stored serialized in
     * "<file>.cache<ord of the delimiter's first byte>" and reused on later calls.
     * A cache that does not unserialize to an array is ignored and rebuilt. If the
     * source file cannot be read the dictionary ends up empty and no cache is written.
     *
     * @param string $file      Path of the dictionary file.
     * @param bool   $useCache  Whether to read/write the serialized cache file.
     * @param string $delimiter Separator between words in the file.
     * @return ChineseWordTokenizer
     */
    public function loadWords($file, $useCache=false, $delimiter="\n") {
        $cacheFile = "{$file}.cache" . ord($delimiter[0]);

        if($useCache && file_exists($cacheFile)) {
            // NOTE(security): unserialize() is only safe on cache files produced by this
            // class; never point $file at a location untrusted parties can write to.
            $cached = unserialize(file_get_contents($cacheFile));

            if(is_array($cached)) {
                return $this->setWords($cached);
            }
            // Corrupt or unreadable cache: fall through and rebuild it.
        }

        $content = file_get_contents($file);
        $readable = ($content !== false);
        $words = $this->_nonEmptyTrimmed(explode($delimiter, $readable ? $content : ""));
        $wordsIndex = [];

        foreach($words as $word) {
            $wordsIndex[mb_strlen($word)] []= $word;
        }

        ksort($wordsIndex);

        // Do not persist an empty dictionary that only exists because the read failed.
        if($useCache && $readable) {
            file_put_contents($cacheFile, serialize($wordsIndex));
        }

        return $this->setWords($wordsIndex);
    }

    /**
     * Check whether a word is in the dictionary.
     *
     * The comparison is strict, so numeric-looking strings such as "010" and "1e1"
     * are not considered equal; dictionary entries are expected to be strings.
     *
     * @param string $word
     * @return bool
     */
    protected function _findWord($word) {
        $word = trim($word);

        // Explicit comparison: empty() would also reject the word "0".
        if($word === "") {
            return false;
        }

        $len = mb_strlen($word);

        if(!isset($this->_wordsMap[$len])) {
            return false;
        }

        return in_array($word, $this->_wordsMap[$len], true);
    }

    /**
     * Remove every candidate whose span overlaps the span [$bpos, $bpos + $len).
     *
     * @param array $ary  Candidates as [length][offset] => word (modified in place).
     * @param int   $len  Length of the claimed span.
     * @param int   $bpos Start offset of the claimed span.
     */
    protected function _removeRelativeWords(&$ary, $len, $bpos) {
        $bend = $bpos + $len;

        foreach ($ary as $alen => $rows) {
            foreach ($rows as $apos => $row) {
                // Two half-open intervals overlap iff each one starts before the other ends.
                if($apos < $bend && $bpos < $apos + $alen) {
                    unset($ary[$alen][$apos]);
                }
            }

            if(empty($ary[$alen])) {
                unset($ary[$alen]);
            }
        }
    }

    /**
     * Flatten found and remaining words into one list ordered by offset.
     *
     * @param array $foundWords     [length][offset] => word, dictionary matches.
     * @param array $remainingWords [length][offset] => word, unmatched text.
     * @return array [offset => ['word' => string, 'found' => 1|0]], sorted by offset.
     */
    protected function _mergeResult($foundWords, $remainingWords) {
        $result = [];

        foreach($foundWords as $rows) {
            // array_map() on a single array keeps the offset keys; "+" keeps existing offsets.
            $result = $result + array_map(function($r) {return ['word' => $r, 'found' => 1];}, $rows);
        }

        foreach($remainingWords as $rows) {
            $result = $result + array_map(function($r) {return ['word' => $r, 'found' => 0];}, $rows);
        }

        ksort($result);
        return $result;
    }
}


// Demo run. NOTE(review): these statements execute whenever this file is loaded,
// including when it is pulled in via include/require just to get the class.
// Build a tokenizer from the tab-delimited dictionary file data2.txt; passing true
// enables the serialized cache that loadWords() reads from / writes to "data2.txt.cache9".
$tokenizer = new ChineseWordTokenizer('data2.txt', true, "\t");
// Split runs of ASCII letters, digits, dots, spaces and apostrophes out of unmatched text.
$tokenizer->separatedFromLetters(true);
// Tokenize a multi-line sample that mixes Chinese and English text.
$result = $tokenizer->compute(
    '中华人民共和国位于亚洲东部，太平洋西岸，是工人阶级领导的、以工农联盟为基础的人民民主专政的社会主义国家.
    中华人民共和国英文缩写为 PRC，全称为People\'s Republic of China.
    CHN是中国(CHINA)的缩写，CHN是在联合国注册的国家代码，国际会议、体育比赛等正式场合代表国家时都用这种统一的国家代码。在网络域名中则以.cn作为缩写
');

// Dump the resulting list of ['word' => ..., 'found' => ...] entries.
print_r($result);